import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import folium
from folium.plugins import HeatMap
import plotly.express as px
from datetime import datetime
import re
from wordcloud import WordCloud
from matplotlib import style
style.use("ggplot")
import warnings
warnings.simplefilter("ignore", UserWarning)
#pd.set_option("display.max_columns", None)
#pd.set_option('display.max_colwidth', 100)
#pd.set_option("display.max_rows", None)
# Use a raw string for the Windows path: '\s' is an invalid escape sequence
# (SyntaxWarning on modern Python) and non-raw backslash paths are fragile.
filename = r'D:\sales_data.csv'
df = pd.read_csv(filename)
df.head()
| Order Date | Order ID | Product | Product_ean | Category | Purchase Address | Quantity Ordered | Price Each | Cost price | turnover | margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/22/2019 21:25 | 141234 | iPhone | 5.640000e+12 | Phones | 944 Walnut St, Boston, MA 02215 | 1 | 700.00 | 231.0000 | 700.00 | 469.0000 |
| 1 | 1/28/2019 14:15 | 141235 | Lightning Charging Cable | 5.560000e+12 | Accessories | 185 Maple St, Portland, OR 97035 | 1 | 14.95 | 7.4750 | 14.95 | 7.4750 |
| 2 | 1/17/2019 13:33 | 141236 | Wired Headphones | 2.110000e+12 | Accessories | 538 Adams St, San Francisco, CA 94016 | 2 | 11.99 | 5.9950 | 23.98 | 11.9900 |
| 3 | 1/5/2019 20:33 | 141237 | 27in FHD Monitor | 3.070000e+12 | NaN | 738 10th St, Los Angeles, CA 90001 | 1 | 149.99 | 97.4935 | 149.99 | 52.4965 |
| 4 | 1/25/2019 11:59 | 141238 | Wired Headphones | 9.690000e+12 | Accessories | 387 10th St, Austin, TX 73301 | 1 | 11.99 | 5.9950 | 11.99 | 5.9950 |
# Normalize column names: spaces -> underscores so columns can be used in
# query strings and attribute-style access.
df.columns = [col.replace(' ', '_') for col in df.columns]
df.head(0)
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin |
|---|
df.shape
(185966, 11)
This dataframe originally contains 185966 records and 11 features.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 185966 entries, 0 to 185965 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order_Date 185966 non-null object 1 Order_ID 185966 non-null int64 2 Product 185966 non-null object 3 Product_ean 185966 non-null float64 4 Category 185366 non-null object 5 Purchase_Address 185966 non-null object 6 Quantity_Ordered 185966 non-null int64 7 Price_Each 185734 non-null float64 8 Cost_price 185966 non-null float64 9 turnover 185966 non-null float64 10 margin 185966 non-null float64 dtypes: float64(5), int64(2), object(4) memory usage: 15.6+ MB
df.describe(include = 'all')
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 185966 | 185966.000000 | 185966 | 1.859660e+05 | 185366 | 185966 | 185966.000000 | 185734.000000 | 185966.000000 | 185966.000000 | 185966.000000 |
| unique | 142395 | NaN | 19 | NaN | 4 | 140787 | NaN | NaN | NaN | NaN | NaN |
| top | 12/15/2019 20:16 | NaN | USB-C Charging Cable | NaN | Accessories | 193 Forest St, San Francisco, CA 94016 | NaN | NaN | NaN | NaN | NaN |
| freq | 8 | NaN | 21905 | NaN | 134168 | 9 | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 230414.910914 | NaN | 5.509176e+12 | NaN | NaN | 1.124388 | 184.040854 | 69.670555 | 185.496050 | 115.292602 |
| std | NaN | 51513.524171 | NaN | 2.598419e+12 | NaN | NaN | 0.442801 | 332.712540 | 109.427028 | 332.928676 | 225.233281 |
| min | NaN | 141234.000000 | NaN | 1.000000e+12 | NaN | NaN | 1.000000 | 2.990000 | 1.495000 | 2.990000 | 1.495000 |
| 25% | NaN | 185828.250000 | NaN | 3.250000e+12 | NaN | NaN | 1.000000 | 11.950000 | 5.975000 | 11.950000 | 5.975000 |
| 50% | NaN | 230368.500000 | NaN | 5.510000e+12 | NaN | NaN | 1.000000 | 14.950000 | 7.475000 | 14.950000 | 7.475000 |
| 75% | NaN | 275032.000000 | NaN | 7.770000e+12 | NaN | NaN | 1.000000 | 150.000000 | 97.500000 | 150.000000 | 52.500000 |
| max | NaN | 319670.000000 | NaN | 1.000000e+13 | NaN | NaN | 9.000000 | 1700.000000 | 561.000000 | 3400.000000 | 2278.000000 |
print(df.dtypes)
Order_Date object Order_ID int64 Product object Product_ean float64 Category object Purchase_Address object Quantity_Ordered int64 Price_Each float64 Cost_price float64 turnover float64 margin float64 dtype: object
# Parse dates with an explicit format: the samples look like "1/22/2019 21:25"
# (month/day/year). An explicit format is much faster than inference and avoids
# silent day/month ambiguity.
# NOTE(review): assumes every row matches this format — to_datetime raises otherwise.
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%m/%d/%Y %H:%M')
# Low-cardinality text columns -> category dtype to save memory.
df['Product'] = df['Product'].astype('category')
df['Category'] = df['Category'].astype('category')
print(df.dtypes)
Order_Date datetime64[ns] Order_ID int64 Product category Product_ean float64 Category category Purchase_Address object Quantity_Ordered int64 Price_Each float64 Cost_price float64 turnover float64 margin float64 dtype: object
df.isnull().sum()
Order_Date 0 Order_ID 0 Product 0 Product_ean 0 Category 600 Purchase_Address 0 Quantity_Ordered 0 Price_Each 232 Cost_price 0 turnover 0 margin 0 dtype: int64
There are 600 null values in the Category feature while there are 232 null values in the Price_Each feature.
# Summarize missingness per column: absolute counts and percentage of rows.
null_totals = df.isnull().sum()
null_pct = null_totals / len(df) * 100
missing_data = pd.DataFrame({'Total missing': null_totals, '% missing': null_pct})
missing_data.sort_values(by='Total missing', ascending=False)
| Total missing | % missing | |
|---|---|---|
| Category | 600 | 0.322640 |
| Price_Each | 232 | 0.124754 |
| Order_Date | 0 | 0.000000 |
| Order_ID | 0 | 0.000000 |
| Product | 0 | 0.000000 |
| Product_ean | 0 | 0.000000 |
| Purchase_Address | 0 | 0.000000 |
| Quantity_Ordered | 0 | 0.000000 |
| Cost_price | 0 | 0.000000 |
| turnover | 0 | 0.000000 |
| margin | 0 | 0.000000 |
The percentage of the null values in the Category and the Price_Each feature are significantly low 0.32% and 0.12% respectively. But due to the nature of the data being records of purchase we can't simply drop or fill with the mean.
def print_null_info(Dataframe, col_name):
    """Print the percentage of null values in one column and return it.

    Parameters
    ----------
    Dataframe : pd.DataFrame
        Frame to inspect.
    col_name : str
        Column whose null percentage is reported.

    Returns
    -------
    float
        Percentage of nulls in ``col_name`` (0-100). Returned so callers
        can reuse the value instead of re-computing it; existing callers
        that ignore the return value are unaffected.
    """
    pct = (Dataframe[col_name].isnull().sum() / len(Dataframe)) * 100
    print(f"Percentage of Nulls in {col_name} = ", pct, ' %')
    return pct
import missingno as msno
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
msno.bar(df)
<Axes: >
msno.matrix(df.sort_values(by='Category'))
<Axes: >
There doesn't seem to be a relation between the null values.
msno.heatmap(df,cmap='GnBu')
<Axes: >
df['Product'].value_counts()
USB-C Charging Cable 21905 Lightning Charging Cable 21660 AAA Batteries (4-pack) 20642 AA Batteries (4-pack) 20578 Wired Headphones 18883 Apple Airpods Headphones 15552 Bose SoundSport Headphones 13327 27in FHD Monitor 7507 iPhone 6843 27in 4K Gaming Monitor 6231 34in Ultrawide Monitor 6181 Google Phone 5525 Flatscreen TV 4800 Macbook Pro Laptop 4725 ThinkPad Laptop 4128 20in Monitor 4101 Vareebadd Phone 2066 LG Washing Machine 666 LG Dryer 646 Name: Product, dtype: int64
df.query("Product == 'LG Dryer'").isnull().sum()
Order_Date 0 Order_ID 0 Product 0 Product_ean 0 Category 3 Purchase_Address 0 Quantity_Ordered 0 Price_Each 168 Cost_price 0 turnover 0 margin 0 dtype: int64
There are 3 null values in Category column and 168 null values in Price_Each column.
df.query("Product == '27in 4K Gaming Monitor'").isnull().sum()
Order_Date 0 Order_ID 0 Product 0 Product_ean 0 Category 21 Purchase_Address 0 Quantity_Ordered 0 Price_Each 0 Cost_price 0 turnover 0 margin 0 dtype: int64
There are 21 null values in Category column.
df[(df['Order_Date'] == pd.Timestamp(2019,5,6))].isnull().sum()
Order_Date 0 Order_ID 0 Product 0 Product_ean 0 Category 0 Purchase_Address 0 Quantity_Ordered 0 Price_Each 0 Cost_price 0 turnover 0 margin 0 dtype: int64
There are no null values in the date 5/06/2019 .
category_df = df[['Product', 'Category']].copy()
category_df.head()
| Product | Category | |
|---|---|---|
| 0 | iPhone | Phones |
| 1 | Lightning Charging Cable | Accessories |
| 2 | Wired Headphones | Accessories |
| 3 | 27in FHD Monitor | NaN |
| 4 | Wired Headphones | Accessories |
# Build a Product -> Category mapping. Drop NaN categories first: dict() keeps
# the LAST value seen per key, so a product whose last occurrence has a missing
# Category would otherwise map to NaN and the fillna below would silently fail
# to repair those rows.
category_dict = dict(category_df.dropna().values)
df.Category = df.Category.fillna(df.Product.map(category_dict))
print_null_info(df, 'Category')
Percentage of Nulls in Category = 0.0 %
Now there are no null values in the Category feature. They have been replaced by their equivalent value according to the product name.
product_price_df = df[['Product', 'Price_Each']].copy()
product_price_df.head()
| Product | Price_Each | |
|---|---|---|
| 0 | iPhone | 700.00 |
| 1 | Lightning Charging Cable | 14.95 |
| 2 | Wired Headphones | 11.99 |
| 3 | 27in FHD Monitor | 149.99 |
| 4 | Wired Headphones | 11.99 |
# Build a Product -> Price_Each mapping. Drop NaN prices first: dict() keeps
# the LAST value seen per key, so a product whose last occurrence has a missing
# price would otherwise map to NaN and the fillna below would be a no-op for it.
product_price_dict = dict(product_price_df.dropna().values)
df.Price_Each = df.Price_Each.fillna(df.Product.map(product_price_dict))
print_null_info(df, 'Price_Each')
Percentage of Nulls in Price_Each = 0.0 %
Here I have replaced the null values in the Price_Each feature with their respective values from the product price dataframe that I have created above.
df.duplicated().sum()
17
There are 17 duplicate records.
df.duplicated(subset=['Order_ID']).sum()
7529
There are 7529 duplicate Order_ID values. After investigating the data I have found that the repeated Order_ID values are the same buyer purchasing different products, so we must keep those records.
df.drop_duplicates(keep='last',inplace=True)
df.duplicated().sum()
0
Now there aren't any duplicate records.
(df['turnover'] == (df['Price_Each'] * df['Quantity_Ordered'])).value_counts()
(df['margin'] == (df['turnover'] - (df['Cost_price']* df['Quantity_Ordered']))).value_counts()
True 185523 False 426 dtype: int64
True 167658 False 18291 dtype: int64
When checking the equality of the above columns, for some reason they are not 100% identical. This may arise from the representation of the float values in memory itself.
I have decided to redo the columns based on the Quantity_Ordered and the Price_Each features so that the Features are homogenous.
df.head(0)
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin |
|---|
# Recompute turnover and margin from Quantity_Ordered, Price_Each and
# Cost_price so the derived columns are exactly consistent with their inputs.
qty = df['Quantity_Ordered']
df['turnover'] = df['Price_Each'] * qty
df['margin'] = df['turnover'] - df['Cost_price'] * qty
df.head()
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-22 21:25:00 | 141234 | iPhone | 5.640000e+12 | Phones | 944 Walnut St, Boston, MA 02215 | 1 | 700.00 | 231.0000 | 700.00 | 469.0000 |
| 1 | 2019-01-28 14:15:00 | 141235 | Lightning Charging Cable | 5.560000e+12 | Accessories | 185 Maple St, Portland, OR 97035 | 1 | 14.95 | 7.4750 | 14.95 | 7.4750 |
| 2 | 2019-01-17 13:33:00 | 141236 | Wired Headphones | 2.110000e+12 | Accessories | 538 Adams St, San Francisco, CA 94016 | 2 | 11.99 | 5.9950 | 23.98 | 11.9900 |
| 3 | 2019-01-05 20:33:00 | 141237 | 27in FHD Monitor | 3.070000e+12 | Appliances | 738 10th St, Los Angeles, CA 90001 | 1 | 149.99 | 97.4935 | 149.99 | 52.4965 |
| 4 | 2019-01-25 11:59:00 | 141238 | Wired Headphones | 9.690000e+12 | Accessories | 387 10th St, Austin, TX 73301 | 1 | 11.99 | 5.9950 | 11.99 | 5.9950 |
df['turnover'].equals(df['Price_Each'] * df['Quantity_Ordered'])
df['margin'].equals((df['turnover'] - (df['Cost_price'] * df['Quantity_Ordered'])))
True
True
Now the columns are homogenous.
# Split "street, city, state zip" on commas. Use a raw string for the regex:
# a non-raw '\s' is an invalid escape sequence (SyntaxWarning on modern Python).
city_string = df['Purchase_Address'].str.split(r',\s*')
df['City'] = city_string.str[1]  # second component is the city name
Here I have extracted the City name from the Purchase_Address feature.
# Raw strings for both regexes ('\s' and '\d' are invalid escapes otherwise).
state_string = df['Purchase_Address'].str.split(r',\s*')
df['State_Code'] = state_string.str[-1]  # e.g. "MA 02215"
# Strip the zip digits, then keep the two-letter state abbreviation.
modified_state_string = df['State_Code'].str.replace(r'\d+', '', regex=True)
df['State_abbr'] = modified_state_string.str[0:2]
Here I have engineered two new features from the Purchase_Address feature which are State_Code and State_abbr .
# Derive calendar features from the parsed Order_Date timestamps.
df['Day_Name'] = df['Order_Date'].dt.day_name()   # e.g. 'Tuesday'
df['Month'] = df['Order_Date'].dt.month_name()    # e.g. 'January'
df['Year'] = df['Order_Date'].dt.year             # integer year
Here I have engineered three new features from the Order_Date feature which are dayname, month and year.
df.head()
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin | City | State_Code | State_abbr | Day_Name | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-22 21:25:00 | 141234 | iPhone | 5.640000e+12 | Phones | 944 Walnut St, Boston, MA 02215 | 1 | 700.00 | 231.0000 | 700.00 | 469.0000 | Boston | MA 02215 | MA | Tuesday | January | 2019 |
| 1 | 2019-01-28 14:15:00 | 141235 | Lightning Charging Cable | 5.560000e+12 | Accessories | 185 Maple St, Portland, OR 97035 | 1 | 14.95 | 7.4750 | 14.95 | 7.4750 | Portland | OR 97035 | OR | Monday | January | 2019 |
| 2 | 2019-01-17 13:33:00 | 141236 | Wired Headphones | 2.110000e+12 | Accessories | 538 Adams St, San Francisco, CA 94016 | 2 | 11.99 | 5.9950 | 23.98 | 11.9900 | San Francisco | CA 94016 | CA | Thursday | January | 2019 |
| 3 | 2019-01-05 20:33:00 | 141237 | 27in FHD Monitor | 3.070000e+12 | Appliances | 738 10th St, Los Angeles, CA 90001 | 1 | 149.99 | 97.4935 | 149.99 | 52.4965 | Los Angeles | CA 90001 | CA | Saturday | January | 2019 |
| 4 | 2019-01-25 11:59:00 | 141238 | Wired Headphones | 9.690000e+12 | Accessories | 387 10th St, Austin, TX 73301 | 1 | 11.99 | 5.9950 | 11.99 | 5.9950 | Austin | TX 73301 | TX | Friday | January | 2019 |
quarters = {
    'Quarter1': ['January', 'February', 'March'],
    'Quarter2': ['April', 'May', 'June'],
    'Quarter3': ['July', 'August', 'September'],
    'Quarter4': ['October', 'November', 'December']
}
# Invert once to a month -> quarter table so each lookup is O(1) instead of
# scanning every quarter's month list on every call.
_MONTH_TO_QUARTER = {m: q for q, months in quarters.items() for m in months}

def month_to_quarter(month):
    """Return the quarter name ('Quarter1'..'Quarter4') for a month name.

    Returns None for an unrecognized month, matching the original
    fall-through behavior.
    """
    return _MONTH_TO_QUARTER.get(month)
df['Quarter'] = df['Month'].apply(month_to_quarter)
seasons = {
    'Winter': ['December', 'January', 'February'],
    'Spring': ['March', 'April', 'May'],
    'Summer': ['June', 'July', 'August'],
    'Autumn': ['September', 'October', 'November']
}
# Invert once to a month -> season table so each lookup is O(1) instead of
# scanning every season's month list on every call.
_MONTH_TO_SEASON = {m: s for s, months in seasons.items() for m in months}

def month_to_season(month):
    """Return the meteorological season name for a month name.

    Returns None for an unrecognized month, matching the original
    fall-through behavior.
    """
    return _MONTH_TO_SEASON.get(month)
df['Season'] = df['Month'].apply(month_to_season)
df.head(1)
| Order_Date | Order_ID | Product | Product_ean | Category | Purchase_Address | Quantity_Ordered | Price_Each | Cost_price | turnover | margin | City | State_Code | State_abbr | Day_Name | Month | Year | Quarter | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-22 21:25:00 | 141234 | iPhone | 5.640000e+12 | Phones | 944 Walnut St, Boston, MA 02215 | 1 | 700.0 | 231.0 | 700.0 | 469.0 | Boston | MA 02215 | MA | Tuesday | January | 2019 | Quarter1 | Winter |
df.shape
(185949, 19)
Now the dataframe has 8 new features (City, State_Code, State_abbr, Day_Name, Month, Year, Quarter, Season).
cols = list(df.columns.values)
cols
['Order_Date', 'Order_ID', 'Product', 'Product_ean', 'Category', 'Purchase_Address', 'Quantity_Ordered', 'Price_Each', 'Cost_price', 'turnover', 'margin', 'City', 'State_Code', 'State_abbr', 'Day_Name', 'Month', 'Year', 'Quarter', 'Season']
# Reorder columns for legibility: order identifiers and date features first,
# then location features, then product and financial measures.
df = df[['Order_ID', 'Order_Date', 'Day_Name', 'Month', 'Year', 'Quarter', 'Season',
'Product_ean', 'Purchase_Address', 'City', 'State_abbr', 'State_Code',
'Category', 'Product', 'Price_Each', 'Quantity_Ordered', 'turnover', 'Cost_price', 'margin']]
df.head()
| Order_ID | Order_Date | Day_Name | Month | Year | Quarter | Season | Product_ean | Purchase_Address | City | State_abbr | State_Code | Category | Product | Price_Each | Quantity_Ordered | turnover | Cost_price | margin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | 2019-01-22 21:25:00 | Tuesday | January | 2019 | Quarter1 | Winter | 5.640000e+12 | 944 Walnut St, Boston, MA 02215 | Boston | MA | MA 02215 | Phones | iPhone | 700.00 | 1 | 700.00 | 231.0000 | 469.0000 |
| 1 | 141235 | 2019-01-28 14:15:00 | Monday | January | 2019 | Quarter1 | Winter | 5.560000e+12 | 185 Maple St, Portland, OR 97035 | Portland | OR | OR 97035 | Accessories | Lightning Charging Cable | 14.95 | 1 | 14.95 | 7.4750 | 7.4750 |
| 2 | 141236 | 2019-01-17 13:33:00 | Thursday | January | 2019 | Quarter1 | Winter | 2.110000e+12 | 538 Adams St, San Francisco, CA 94016 | San Francisco | CA | CA 94016 | Accessories | Wired Headphones | 11.99 | 2 | 23.98 | 5.9950 | 11.9900 |
| 3 | 141237 | 2019-01-05 20:33:00 | Saturday | January | 2019 | Quarter1 | Winter | 3.070000e+12 | 738 10th St, Los Angeles, CA 90001 | Los Angeles | CA | CA 90001 | Appliances | 27in FHD Monitor | 149.99 | 1 | 149.99 | 97.4935 | 52.4965 |
| 4 | 141238 | 2019-01-25 11:59:00 | Friday | January | 2019 | Quarter1 | Winter | 9.690000e+12 | 387 10th St, Austin, TX 73301 | Austin | TX | TX 73301 | Accessories | Wired Headphones | 11.99 | 1 | 11.99 | 5.9950 | 5.9950 |
Here I have reordered the columns position for better legibility.
df['Year'].value_counts()
df['Month'].value_counts()
df['Day_Name'].value_counts()
2019 185915 2020 34 Name: Year, dtype: int64
December 24984 October 20282 April 18279 November 17573 May 16566 March 15153 July 14292 June 13554 February 11975 August 11961 September 11621 January 9709 Name: Month, dtype: int64
Tuesday 27175 Sunday 26551 Monday 26547 Saturday 26491 Wednesday 26477 Thursday 26461 Friday 26247 Name: Day_Name, dtype: int64
In the Year feature there are only two values: 2019 and 2020. Because 2020 has so few records (34 out of 185949), we can't make comparisons based on Year.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 185949 entries, 0 to 185965 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order_ID 185949 non-null int64 1 Order_Date 185949 non-null datetime64[ns] 2 Day_Name 185949 non-null object 3 Month 185949 non-null object 4 Year 185949 non-null int64 5 Quarter 185949 non-null object 6 Season 185949 non-null object 7 Product_ean 185949 non-null float64 8 Purchase_Address 185949 non-null object 9 City 185949 non-null object 10 State_abbr 185949 non-null object 11 State_Code 185949 non-null object 12 Category 185949 non-null category 13 Product 185949 non-null category 14 Price_Each 185949 non-null float64 15 Quantity_Ordered 185949 non-null int64 16 turnover 185949 non-null float64 17 Cost_price 185949 non-null float64 18 margin 185949 non-null float64 dtypes: category(2), datetime64[ns](1), float64(5), int64(3), object(8) memory usage: 25.9+ MB
Now there are no null values in the entire dataframe.
print(df.dtypes)
Order_ID int64 Order_Date datetime64[ns] Day_Name object Month object Year int64 Quarter object Season object Product_ean float64 Purchase_Address object City object State_abbr object State_Code object Category category Product category Price_Each float64 Quantity_Ordered int64 turnover float64 Cost_price float64 margin float64 dtype: object
# Convert the remaining low-cardinality text columns to the memory-efficient
# category dtype in one pass.
for _col in ('Day_Name', 'Month', 'Quarter', 'Season', 'City', 'State_abbr'):
    df[_col] = df[_col].astype('category')
print(df.dtypes)
Order_ID int64 Order_Date datetime64[ns] Day_Name category Month category Year int64 Quarter category Season category Product_ean float64 Purchase_Address object City category State_abbr category State_Code object Category category Product category Price_Each float64 Quantity_Ordered int64 turnover float64 Cost_price float64 margin float64 dtype: object
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(15, 5))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()
<Figure size 1500x500 with 0 Axes>
<Axes: >
Text(0.5, 1.0, 'Correlation Matrix')
Prepping for the machine learning model, some features would have to be dropped as they represent almost exactly the same thing due to very high correlation.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 185949 entries, 0 to 185965 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order_ID 185949 non-null int64 1 Order_Date 185949 non-null datetime64[ns] 2 Day_Name 185949 non-null category 3 Month 185949 non-null category 4 Year 185949 non-null int64 5 Quarter 185949 non-null category 6 Season 185949 non-null category 7 Product_ean 185949 non-null float64 8 Purchase_Address 185949 non-null object 9 City 185949 non-null category 10 State_abbr 185949 non-null category 11 State_Code 185949 non-null object 12 Category 185949 non-null category 13 Product 185949 non-null category 14 Price_Each 185949 non-null float64 15 Quantity_Ordered 185949 non-null int64 16 turnover 185949 non-null float64 17 Cost_price 185949 non-null float64 18 margin 185949 non-null float64 dtypes: category(8), datetime64[ns](1), float64(5), int64(3), object(2) memory usage: 18.4+ MB
fig = px.pie(df, names = 'Category')
fig.show()
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
def preprocess_text(text):
    """Lowercase *text*, strip punctuation, and remove English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens re-joined with single spaces.
    """
    text = text.lower()
    # One C-level pass removing every punctuation character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    # Use a set: stopwords.words() returns a list, which would make each
    # membership test O(n) per token.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)
df['Category'] = df['Category'].apply(preprocess_text)
# Count word frequencies across the cleaned Category column.
from collections import Counter
word_freq = Counter()
for text in df['Category']:
    word_freq.update(text.split())
print(word_freq.most_common(10))
# Render the frequencies as a word cloud (word size proportional to frequency).
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
[('accessories', 134599), ('appliances', 30131), ('phones', 12367), ('electronics', 8852)]
<Figure size 800x800 with 0 Axes>
<matplotlib.image.AxesImage at 0x231fb6ddf10>
(-0.5, 799.5, 399.5, -0.5)
df['Category'].value_counts()
accessories 134599 appliances 30131 phones 12367 electronics 8852 Name: Category, dtype: int64
# Total margin per category on a polar plot.
radii = df.groupby('Category')['margin'].sum()
# BUG FIX: the angular labels must come from the groupby result's own index.
# groupby sorts the categories, while df['Category'].unique() preserves
# appearance order, so pairing the two could attach the wrong label to each
# value.
theta = radii.index
plt.polar(theta, radii)
plt.title('Total Sales Amount by Product Category')
plt.xticks(rotation=45)
plt.show()
[<matplotlib.lines.Line2D at 0x231fb6f2910>]
Text(0.5, 1.0, 'Total Sales Amount by Product Category')
([0, 1, 2, 3], [Text(0, 0, 'phones'), Text(1, 0, 'accessories'), Text(2, 0, 'appliances'), Text(3, 0, 'electronics')])
Although accessories is the most frequently bought category and electronics is the least frequently bought, the electronics category has a high turnover-to-cost ratio, leading to large margin profits.
df.pivot_table(index = ['Category', 'Product'], values = ['Quantity_Ordered', ], aggfunc = 'sum')
| Quantity_Ordered | ||
|---|---|---|
| Category | Product | |
| accessories | 20in Monitor | 0 |
| 27in 4K Gaming Monitor | 0 | |
| 27in FHD Monitor | 0 | |
| 34in Ultrawide Monitor | 0 | |
| AA Batteries (4-pack) | 27634 | |
| ... | ... | ... |
| electronics | ThinkPad Laptop | 4130 |
| USB-C Charging Cable | 0 | |
| Vareebadd Phone | 0 | |
| Wired Headphones | 0 | |
| iPhone | 0 |
76 rows × 1 columns
# Count word frequencies across product names. Names are split on whitespace,
# so each word of a multi-word product contributes separately.
product_freq = Counter()
for text in df['Product']:
    product_freq.update(text.split())
print(product_freq.most_common(10))
# Render the frequencies as a word cloud (word size proportional to frequency).
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(product_freq)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
[('Headphones', 47756), ('Charging', 43561), ('Cable', 43561), ('Batteries', 41217), ('(4-pack)', 41217), ('Monitor', 24019), ('USB-C', 21903), ('Lightning', 21658), ('AAA', 20641), ('AA', 20576)]
<Figure size 800x800 with 0 Axes>
<matplotlib.image.AxesImage at 0x231fb8852d0>
(-0.5, 799.5, 399.5, -0.5)
px.histogram(df, x="Day_Name")
The highest recorded number of purchases was on Tuesday with 27175 purchases (highest traffic) and the lowest was on Friday with 26247 purchases (lowest traffic), although there is not a massive difference between the highest and lowest days (only 928 purchases).
px.histogram(df, x="Month")
The highest recorded number of purchases was in December with 24984 purchases and the lowest was in January with 9709 purchases.
df.pivot_table(index = ['Category', 'Product'], values = ['Quantity_Ordered', ], aggfunc = 'sum')
| Quantity_Ordered | ||
|---|---|---|
| Category | Product | |
| accessories | 20in Monitor | 0 |
| 27in 4K Gaming Monitor | 0 | |
| 27in FHD Monitor | 0 | |
| 34in Ultrawide Monitor | 0 | |
| AA Batteries (4-pack) | 27634 | |
| ... | ... | ... |
| electronics | ThinkPad Laptop | 4130 |
| USB-C Charging Cable | 0 | |
| Vareebadd Phone | 0 | |
| Wired Headphones | 0 | |
| iPhone | 0 |
76 rows × 1 columns
There are only two phones sold which are the iphone and the Google Phone.
# Top 10 cities by total profit (descending).
city_margin = df.groupby('City')['margin'].sum().sort_values(ascending=False).head(10)
city_margin
City San Francisco 5.133583e+06 Los Angeles 3.387181e+06 New York City 2.898378e+06 Boston 2.277973e+06 Atlanta 1.739016e+06 Dallas 1.724265e+06 Seattle 1.709025e+06 Portland 1.439438e+06 Austin 1.129208e+06 Name: margin, dtype: float64
# Bar chart of total profit per city; bars follow city_margin's order
# (already sorted descending).
plt.bar(city_margin.index, city_margin.values)
plt.xticks(rotation = 90)
plt.xlabel('City')
plt.ylabel('Total Profit')
plt.title('Total Profit by City')
plt.show()
<BarContainer object of 9 artists>
([0, 1, 2, 3, 4, 5, 6, 7, 8], [Text(0, 0, 'San Francisco'), Text(1, 0, 'Los Angeles'), Text(2, 0, 'New York City'), Text(3, 0, 'Boston'), Text(4, 0, 'Atlanta'), Text(5, 0, 'Dallas'), Text(6, 0, 'Seattle'), Text(7, 0, 'Portland'), Text(8, 0, 'Austin')])
Text(0.5, 0, 'City')
Text(0, 0.5, 'Total Profit')
Text(0.5, 1.0, 'Total Profit by City')
San Francisco brought in the largest profit of 5.13M while Austin brought in the least with 1.13M.
Advertisment should be focused on cities like Austin and Portland as they bring in the lowest profit for the company.
# Rank ALL months by total profit. The previous nlargest(10) (copy-pasted from
# the 10-city analysis) silently dropped the two lowest-profit months (January
# and September) from a 12-month ranking.
month_margin = df.groupby('Month')['margin'].sum().sort_values(ascending=False)
month_margin
Month December 2.869516e+06 October 2.325213e+06 April 2.109617e+06 November 1.983143e+06 May 1.965322e+06 March 1.741473e+06 July 1.642683e+06 June 1.602693e+06 August 1.396144e+06 February 1.369115e+06 Name: margin, dtype: float64
# Bar chart of total profit per month; bars follow month_margin's order
# (already sorted descending).
plt.bar(month_margin.index, month_margin.values)
plt.xticks(rotation = 90)
plt.xlabel('Month')
plt.ylabel('Total Profit')
plt.title('Total Profit by Month')
plt.show()
<BarContainer object of 10 artists>
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [Text(0, 0, 'December'), Text(1, 0, 'October'), Text(2, 0, 'April'), Text(3, 0, 'November'), Text(4, 0, 'May'), Text(5, 0, 'March'), Text(6, 0, 'July'), Text(7, 0, 'June'), Text(8, 0, 'August'), Text(9, 0, 'February')])
Text(0.5, 0, 'Month')
Text(0, 0.5, 'Total Profit')
Text(0.5, 1.0, 'Total Profit by Month')
December has the highest margin with 2.87M in profit, which makes sense as the greatest number of purchases was also in December.
# Total profit per quarter, sorted descending. (There are only 4 quarters,
# so a full sort is equivalent to the former nlargest(10).)
quarter_margin = df.groupby('Quarter')['margin'].sum().sort_values(ascending=False)
quarter_margin
Quarter Quarter4 7.177872e+06 Quarter2 5.677632e+06 Quarter3 4.339011e+06 Quarter1 4.243551e+06 Name: margin, dtype: float64
# Bar chart of total profit per quarter; bars follow quarter_margin's order
# (already sorted descending).
plt.bar(quarter_margin.index, quarter_margin.values)
plt.xticks(rotation = 90)
plt.xlabel('Quarter')
plt.ylabel('Total Profit')
plt.title('Total Profit by Quarter')
plt.show()
<BarContainer object of 4 artists>
([0, 1, 2, 3], [Text(0, 0, 'Quarter4'), Text(1, 0, 'Quarter2'), Text(2, 0, 'Quarter3'), Text(3, 0, 'Quarter1')])
Text(0.5, 0, 'Quarter')
Text(0, 0.5, 'Total Profit')
Text(0.5, 1.0, 'Total Profit by Quarter')
Quarter 4 has the highest margin with 7.18M in profit, while the lowest was Quarter 1 with 4.24M.
# Total profit per season, sorted descending. (There are only 4 seasons,
# so a full sort is equivalent to the former nlargest(10).)
season_margin = df.groupby('Season')['margin'].sum().sort_values(ascending=False)
season_margin
Season Spring 5.816412e+06 Autumn 5.608540e+06 Winter 5.371594e+06 Summer 4.641520e+06 Name: margin, dtype: float64
# Bar chart of total profit per season; bars follow season_margin's order
# (already sorted descending).
plt.bar(season_margin.index, season_margin.values)
plt.xticks(rotation = 90)
plt.xlabel('Season')
plt.ylabel('Total Profit')
plt.title('Total Profit by Season')
plt.show()
<BarContainer object of 4 artists>
([0, 1, 2, 3], [Text(0, 0, 'Spring'), Text(1, 0, 'Autumn'), Text(2, 0, 'Winter'), Text(3, 0, 'Summer')])
Text(0.5, 0, 'Season')
Text(0, 0.5, 'Total Profit')
Text(0.5, 1.0, 'Total Profit by Season')
Spring has the highest margin with 5.82M in profit, while the lowest was Summer with 4.64M.
# Purchases per state, rendered as a plotly USA choropleth.
# NOTE(review): removed the unused `basemap = folium.Map()` local — the chart
# below is a plotly figure and never referenced the folium map.
state = df['State_abbr'].value_counts().reset_index()
state.columns = ['State_abbr', 'Number of Purchases']
city_map = px.choropleth(state, locations = state['State_abbr'], locationmode = 'USA-states', scope = 'usa',
                         color = state['Number of Purchases'], hover_name = state['State_abbr'])
city_map.show()
By far California has the most amount of purchases with 74.33k records.